Import data with pandas using read_table
In [129]:
import csv

import pandas as pd

# The file is tab-separated with no header row; the two columns are named
# here: the ham/spam label and the raw message text.
#
# quoting=csv.QUOTE_NONE: SMS text is free-form and may contain unbalanced
# double quotes. With the parser's default quoting, a field that starts with
# `"` is read as a quoted field that runs until the next closing quote, which
# can silently merge several rows into one message. Disabling quote handling
# keeps exactly one message per line and preserves the text verbatim.
df = pd.read_table(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
    quoting=csv.QUOTE_NONE,
)
df.head()
Out[129]:
Clean the data: encode the labels as integers (ham: 0, spam: 1)
In [130]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
#
# Values that are not keys of `label_codes` are passed through unchanged, so
# re-running this cell on already-encoded labels is a no-op. A plain dict
# .map() would instead turn every already-encoded 0/1 into NaN on a second run.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Fail early, with a clear message, if the file contained any other label.
assert df['label'].isin([0, 1]).all(), 'unexpected value in label column'
df.head()
Out[130]:
Implement bag of words
In [131]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings. It is only constructed here;
# it is fitted on the training messages in a later cell.
count_vector = CountVectorizer()
In [132]:
from sklearn.model_selection import train_test_split
# (Older scikit-learn releases exposed this function as
# sklearn.cross_validation.train_test_split.)

# Split messages and labels into train/test subsets; random_state is fixed so
# the split is the same on every run.
messages = df['sms_message']
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(messages, labels, random_state=1)

# Report how many rows ended up in each subset.
row_reports = (
    ('Number of rows in the total set: {}', df),
    ('Number of rows in the training set: {}', X_train),
    ('Number of rows in the test set: {}', X_test),
)
for template, subset in row_reports:
    print(template.format(subset.shape[0]))
In [133]:
# Fit the vectorizer on the training messages only and build their
# document-term count matrix in the same pass.
training_data = count_vector.fit_transform(X_train)
# Encode the test messages with the already-fitted vectorizer (transform only,
# no re-fitting), so the test set does not influence the learned vocabulary.
testing_data = count_vector.transform(X_test)
In [134]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training count matrix.
# fit() returns the estimator itself, so construction and training are chained.
naive_bayes = MultinomialNB().fit(training_data, y_train)

# Last expression of the cell: display the fitted estimator.
naive_bayes
Out[134]:
In [140]:
# Ask the user for one message and wrap it in a list, because the vectorizer
# expects a collection of documents rather than a single string.
inputvar = input('Please enter the sms message to classify it as Spam or Not!')
test_sample = [inputvar]
print(test_sample)

# Encode the message with the fitted vectorizer and classify it.
doc_sample = count_vector.transform(test_sample)
predictions = naive_bayes.predict(doc_sample)

# Predictions for the whole test set; the metrics cell reads `prediction`.
prediction = naive_bayes.predict(testing_data)

print(predictions)

# Labels were encoded as ham -> 0 and spam -> 1.
status = predictions[0]
print("Not spam" if status == 0 else "spam")
In [139]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict the held-out test set in this cell, so the metrics do not depend on
# the interactive input() cell having been executed first.
prediction = naive_bayes.predict(testing_data)

# Labels are encoded ham -> 0, spam -> 1, so with scikit-learn's default
# pos_label=1 the precision, recall and F1 values describe the spam class.
# str.format places each value directly after its label; passing the label and
# format(value) as separate print() arguments produced a double space.
print('Accuracy score: {}'.format(accuracy_score(y_test, prediction)))
print('Precision score: {}'.format(precision_score(y_test, prediction)))
print('Recall score: {}'.format(recall_score(y_test, prediction)))
print('F1 score: {}'.format(f1_score(y_test, prediction)))